In [30]:
import numpy as np
import pandas as pd
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
In [2]:
train = pd.read_csv('train.tsv', sep='\t', header=0)
test = pd.read_csv('test.tsv', sep='\t', header=0)
In [3]:
train.shape, test.shape
Out[3]:
In [4]:
train.head()
Out[4]:
In [5]:
test.head()
Out[5]:
In [6]:
raw_docs_train = train['Phrase'].values
raw_docs_test = test['Phrase'].values
sentiment_train = train['Sentiment'].values
num_labels = len(np.unique(sentiment_train))
In [7]:
np.unique(sentiment_train)
Out[7]:
In [8]:
stop_words = set(stopwords.words('english'))
print (stop_words)
In [9]:
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
print (stop_words)
In [10]:
stemmer = SnowballStemmer('english')
In [11]:
print "pre-processing train docs..."
processed_docs_train = []
for index, doc in enumerate(raw_docs_train):
tokens = word_tokenize(doc)
filtered = [word for word in tokens if word not in stop_words]
stemmed = [stemmer.stem(word) for word in filtered]
processed_docs_train.append(stemmed)
if index == 0:
print ('\n')
print (doc)
print ('\n')
print (tokens)
print ('\n')
print (filtered)
print ('\n')
print (stemmed)
In [12]:
print "pre-processing test docs..."
processed_docs_test = []
for doc in raw_docs_test:
tokens = word_tokenize(doc)
filtered = [word for word in tokens if word not in stop_words]
stemmed = [stemmer.stem(word) for word in filtered]
processed_docs_test.append(stemmed)
In [13]:
processed_docs_all = np.concatenate((processed_docs_train, processed_docs_test), axis=0)
In [14]:
dictionary = corpora.Dictionary(processed_docs_all)
dictionary_size = len(dictionary.keys())
print "dictionary size: ", dictionary_size
In [15]:
dictionary[0], dictionary[14]
Out[15]:
In [16]:
print "converting to token ids..."
word_id_train, word_id_len = [], []
for index,doc in enumerate(processed_docs_train):
word_ids = [dictionary.token2id[word] for word in doc]
word_id_train.append(word_ids)
word_id_len.append(len(word_ids))
if index == 0:
print (doc)
print (word_ids)
print (word_id_train)
print (word_id_len)
In [17]:
word_id_test, word_ids = [], []
for doc in processed_docs_test:
    word_ids = [dictionary.token2id[word] for word in doc]
    word_id_test.append(word_ids)
    word_id_len.append(len(word_ids))
In [18]:
seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
print (np.mean(word_id_len))
print (np.std(word_id_len))
print (seq_len)
In [19]:
#pad sequences
word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
y_train_enc = np_utils.to_categorical(sentiment_train, num_labels)
In [20]:
print (word_id_train)
In [21]:
print (y_train_enc)
Long short-term memory (LSTM) is a recurrent neural network (RNN) architecture that can remember values over arbitrary time intervals. Unlike feed-forward networks, RNNs contain feedback (recurrent) connections between neurons, so information from earlier time steps can influence later ones.
An LSTM network contains LSTM units instead of, or in addition to, other network units. An LSTM unit can retain values for either long or short periods of time. The key to this ability is that the cell state is updated additively through a gated, linear self-connection rather than being passed through a squashing activation at every step. The stored value is therefore not repeatedly rescaled, and the gradient does not tend to vanish when the network is trained with backpropagation through time.
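To make the gating idea concrete, here is a minimal numpy sketch of a single LSTM step. This is an illustration only, not Keras's implementation; the weight dictionaries W, U and biases b are hypothetical placeholders with one entry per gate.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(x_t, h_prev, c_prev, W, U, b):
    # gates: input (i), forget (f), output (o), plus the candidate cell value (g)
    i = sigmoid(np.dot(W['i'], x_t) + np.dot(U['i'], h_prev) + b['i'])
    f = sigmoid(np.dot(W['f'], x_t) + np.dot(U['f'], h_prev) + b['f'])
    o = sigmoid(np.dot(W['o'], x_t) + np.dot(U['o'], h_prev) + b['o'])
    g = np.tanh(np.dot(W['c'], x_t) + np.dot(U['c'], h_prev) + b['c'])
    c_t = f * c_prev + i * g   # additive cell-state update: the stored value is not squashed
    h_t = o * np.tanh(c_t)     # hidden state passed to the next time step / layer
    return h_t, c_t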
keras.layers.recurrent.LSTM(units, activation='tanh', recurrent_activation='hard_sigmoid', use_bias=True, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros', unit_forget_bias=True, kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, activity_regularizer=None, kernel_constraint=None, recurrent_constraint=None, bias_constraint=None, dropout=0.0, recurrent_dropout=0.0)
units: Positive integer, dimensionality of the output space.
dropout: Float between 0 and 1. Fraction of the units to drop for the linear transformation of the inputs.
recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for the linear transformation of the recurrent state.
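As a quick illustration of these arguments, a toy LSTM layer on its own (made-up shapes, separate from the model built below):
from keras.models import Sequential
from keras.layers import LSTM

toy = Sequential()
# 16 timesteps of 8-dimensional input vectors in, one 128-dimensional vector out per sequence
toy.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, input_shape=(16, 8)))
toy.compile(loss='mse', optimizer='rmsprop')
print (toy.output_shape)   # (None, 128)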
Embedding Layer
The Embedding layer turns positive integers (indexes) into dense vectors of fixed size, e.g. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]].
keras.layers.embeddings.Embedding(input_dim, output_dim, embeddings_initializer='uniform', embeddings_regularizer=None, activity_regularizer=None, embeddings_constraint=None, mask_zero=False, input_length=None)
This layer can only be used as the first layer in a model.
Example:
model.add(Embedding(1000, 64, input_length=10))
In the above example code, the model will take as input an integer matrix of size (batch, input_length). The largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
Now model.output_shape == (None, 10, 64), where None is the batch dimension.
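The shapes in this example can be checked with a small self-contained snippet (toy values only, taken from the example above):
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding

toy_model = Sequential()
toy_model.add(Embedding(1000, 64, input_length=10))   # vocabulary of 1000, 64-dimensional vectors
toy_model.compile(loss='mse', optimizer='rmsprop')

toy_input = np.random.randint(1000, size=(32, 10))    # (batch, input_length)
print (toy_model.output_shape)                        # (None, 10, 64)
print (toy_model.predict(toy_input).shape)            # (32, 10, 64)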
In [27]:
#LSTM
print "fitting LSTM ..."
model = Sequential()
model.add(Embedding(dictionary_size, 128))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(num_labels))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
In [28]:
model.fit(word_id_train, y_train_enc, epochs=3, batch_size=256, verbose=1)
Out[28]:
We add a one-dimensional convolution layer Conv1D() and a max pooling layer MaxPooling1D() after the Embedding layer; these extract local features from the embedded sequence and feed them to the LSTM. We use 32 filters with a kernel size of 3. The pooling layer uses the standard pool size of 2, halving the size of the feature maps.
In [31]:
#LSTM
print "fitting LSTM ..."
model = Sequential()
model.add(Embedding(dictionary_size, 128))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# sigmoid activation for binary classification
# softmax activation for multi-class classification
model.add(Dense(num_labels, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
In [32]:
model.fit(word_id_train, y_train_enc, epochs=3, batch_size=256, verbose=1)
Out[32]:
In [33]:
test_pred = model.predict_classes(word_id_test)
In [36]:
test_pred
Out[36]:
In [37]:
#make a submission
test['Sentiment'] = test_pred.reshape(-1,1)
header = ['PhraseId', 'Sentiment']
test.to_csv('./submission_lstm_cnn.csv', columns=header, index=False, header=True)